{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Parallel sum benchmark\n",
    "\n",
    "This notebook performs a simple parallel sum benchmark with both CPUs and GPUs using Dask. You can adjust the model parameters and the number of CPU cores or GPUs to use below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Problem size: the benchmark array has shape (xdim, ydim) and is split\n",
    "# into Dask chunks of shape (x_chunk_size, y_chunk_size).\n",
    "xdim, ydim = 500_000, 500_000\n",
    "x_chunk_size, y_chunk_size = 10_000, 10_000\n",
    "\n",
    "# Number of Dask workers started for the CPU run and for the GPU run\n",
    "num_cpu_cores = 8\n",
    "num_gpus = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries for interacting with Dask and CUDA\n",
    "import time\n",
    "import cupy\n",
    "import dask\n",
    "import dask.array as da\n",
    "from dask_cuda import LocalCUDACluster\n",
    "from dask.distributed import Client, LocalCluster"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Common functions for performing both CPU and GPU benchmark\n",
    "\n",
    "def create_data(rs, xdim, ydim, x_chunk_size, y_chunk_size):\n",
    "    \"\"\"Return an (xdim, ydim) array of normal samples (mean 10, std 1).\n",
    "\n",
    "    The array is drawn from the random state `rs` and split into chunks of\n",
    "    shape (x_chunk_size, y_chunk_size). The same function serves both\n",
    "    benchmarks; only the random state passed in differs.\n",
    "    \"\"\"\n",
    "    return rs.normal(10, 1, size=(xdim, ydim), chunks=(x_chunk_size, y_chunk_size))\n",
    "\n",
    "def run(data):\n",
    "    \"\"\"Add 1 to `data`, keep every other row and column, and sum the rest.\n",
    "\n",
    "    Calls `.compute()` on the reduction and returns its result so the CPU\n",
    "    and GPU runs can be checked against each other; callers that only want\n",
    "    the timing can ignore the return value.\n",
    "    \"\"\"\n",
    "    return (data + 1)[::2, ::2].sum().compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run GPU benchmark\n",
    "cluster = LocalCUDACluster(n_workers=num_gpus)\n",
    "client = Client(cluster)\n",
    "\n",
    "try:\n",
    "    start = time.time()\n",
    "    print(\"Allocating and initializing arrays using GPU memory with CuPY\")\n",
    "    rs = da.random.RandomState(RandomState=cupy.random.RandomState)\n",
    "    x = create_data(rs, xdim, ydim, x_chunk_size, y_chunk_size)\n",
    "    print('Array size: {:.2f} TB.  Computing parallel sum . . .'.format(x.nbytes/1e12))\n",
    "    run(x)\n",
    "    end = time.time()\n",
    "    delta = end - start\n",
    "finally:\n",
    "    # Shut the GPU workers down even if the run fails, so they do not keep\n",
    "    # running during the CPU benchmark and re-running this cell does not\n",
    "    # stack up clusters.\n",
    "    client.close()\n",
    "    cluster.close()\n",
    "\n",
    "print(\"GPU parallel sum complete!\")\n",
    "print(\"Wall time create data + computation time: {:10.8f} seconds\".format(delta))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run CPU benchmark\n",
    "cluster = LocalCluster(n_workers=num_cpu_cores)\n",
    "client = Client(cluster)\n",
    "\n",
    "try:\n",
    "    start = time.time()\n",
    "    print(\"Allocating and initializing arrays using CPU memory\")\n",
    "    rs = da.random.RandomState()\n",
    "    x = create_data(rs, xdim, ydim, x_chunk_size, y_chunk_size)\n",
    "    print('Array size: {:.2f} TB.  Computing parallel sum . . .'.format(x.nbytes/1e12))\n",
    "    run(x)\n",
    "    end = time.time()\n",
    "    delta = end - start\n",
    "finally:\n",
    "    # Shut the CPU workers down even if the run fails, so re-running this\n",
    "    # cell does not stack up clusters.\n",
    "    client.close()\n",
    "    cluster.close()\n",
    "\n",
    "# This is the CPU run; the message previously said \"GPU\" (copy-paste slip).\n",
    "print(\"CPU parallel sum complete!\")\n",
    "print('Wall time create data + computation time: {:10.8f} seconds'.format(delta))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
